<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    
    
    <title>js正则表达式之爬虫 | 追光者的部落 | 种一棵树最好的时间是十年之前,其次是现在</title>
    <!-- No maximum-scale here: capping the zoom level prevents users from pinch-zooming the page. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    
    <meta name="keywords" content="JavaScript">
    <link rel="shortcut icon" href="/img/favicon.ico">
    <link rel="stylesheet" href="/css/style.css?v=1.0.0">
    
    <script type="text/javascript">
        // Data Center: page-level settings exposed as the global `DC`.
        // Each sub-object is parsed from a JSON string literal.
        // NOTE(review): presumably consumed by the theme scripts loaded at the end of <body>
        // (e.g. `v` looks like the Valine comment settings) — confirm against those scripts.
        var DC = {
            reward:	true,
            lv: JSON.parse('{"enable":false,"app_id":null,"app_key":null,"icon":true}'),
            v: JSON.parse('{"enable":true,"appid":"OCT05YTTqCrexJcyc5Ay0XXD-gzGzoHsz","appkey":"L6b0lDuLhBWOEUhEYmfA7g7C","notify":false,"verify":false,"placeholder":"suggest","avatar":"wavatar"}'),
            g: JSON.parse('{"enable":false,"lazy":true,"owner":"longmartin","repo":null,"oauth":{"client_id":null,"client_secret":null},"perPage":10}'),
            d: JSON.parse('{"app_id":null}')
        };
    </script>
    <script type="text/javascript">
        // Global list, created empty here.
        // NOTE(review): presumably filled with deferred script URLs by later code — confirm.
        window.lazyScripts=[];
    </script>
    
</head>


<body>
    <div id="loading" class="active"></div>

    <aside id="menu" class="hide">
  <div class="inner flex-row-vertical">
    <!-- Icon-only control: aria-label supplies the accessible name. -->
    <a href="javascript:;" class="header-icon waves-effect waves-circle waves-light" id="menu-off" aria-label="关闭菜单">
        <i class="icon icon-lg icon-close" aria-hidden="true"></i>
    </a>
    <div class="brand-wrap">
      
      <!-- Treated as a decorative background (class brand-bg), hence the empty alt. -->
      <img src="/img/brand.jpg" class="brand-bg" alt="">
      
      <div class="brand">
        <a href="/" class="avatar waves-effect waves-circle waves-light">
          <!-- Same alt text as the avatar image in the post copyright footer. -->
          <img src="/img/avatar.jpg" alt="Martin Long">
        </a>
        <hgroup class="introduce">
          <h5 class="nickname">Martin Long</h5>
          <!-- The address is emitted one character per span; the full address is in href/title. -->
          <a href="mailto:1156341485@qq.com" title="1156341485@qq.com" class="mail">
            
              <span>1</span>
            
              <span>1</span>
            
              <span>5</span>
            
              <span>6</span>
            
              <span>3</span>
            
              <span>4</span>
            
              <span>1</span>
            
              <span>4</span>
            
              <span>8</span>
            
              <span>5</span>
            
              <span>@</span>
            
              <span>q</span>
            
              <span>q</span>
            
              <span>.</span>
            
              <span>c</span>
            
              <span>o</span>
            
              <span>m</span>
            
          </a>
        </hgroup>
        
        <ul class="menu-link">
          
              <li>
                <!-- rel="noopener noreferrer": the new tab gets no window.opener handle to this page. -->
                <a href="https://github.com/longmartin" target="_blank" rel="noopener noreferrer" aria-label="GitHub">
                  <i class="icon icon-lg icon-github" aria-hidden="true"></i>
                </a>
              </li>
            
        </ul>
        
      </div>
    </div>
    <div class="scroll-wrap flex-col">
      <ul class="nav">
        
            <li class="">
              <a href="/">
                <i class="icon icon-lg icon-home" aria-hidden="true"></i>
                HOME
              </a>
            </li>
        
            <li class="">
              <a href="/categories">
                <i class="icon icon-lg icon-th-list" aria-hidden="true"></i>
                分类
              </a>
            </li>
        
            <li class="">
              <a href="/tags">
                <i class="icon icon-lg icon-tags" aria-hidden="true"></i>
                标签
              </a>
            </li>
        
            <li class="">
              <a href="/archives">
                <i class="icon icon-lg icon-archives" aria-hidden="true"></i>
                文档
              </a>
            </li>
        
            <li class="">
              <a href="https://github.com/longmartin" target="_blank" rel="noopener noreferrer">
                <i class="icon icon-lg icon-github" aria-hidden="true"></i>
                Github
              </a>
            </li>
        
            <li class="">
              <a href="https://www.jianshu.com/u/696184ca9e21" target="_blank" rel="noopener noreferrer">
                <i class="icon icon-lg icon-link" aria-hidden="true"></i>
                简书
              </a>
            </li>
        
      </ul>
    </div>
  </div>
</aside>

    <main id="main">
        <header class="top-header" id="header">
    <div class="flex-row clearfix">
        <!-- Icon-only controls below carry an aria-label so they have an accessible name. -->
        <a href="javascript:;" class="header-icon pull-left waves-effect waves-circle waves-light on" id="menu-toggle" aria-label="菜单">
          <i class="icon icon-lg icon-navicon" aria-hidden="true"></i>
        </a>
        <div class="flex-col header-title ellipsis">
            <span>js正则表达式之爬虫</span>
            
        </div>
        
        <a href="javascript:;" id="site_search_btn" class="header-icon pull-right waves-effect waves-circle waves-light" aria-label="搜索">
            <i class="icon icon-lg icon-search" aria-hidden="true"></i>
        </a>
        
    </div>
</header>
<header class="content-header post-header">
    <div class="container fade-scale">
        <h1 class="title">js正则表达式之爬虫</h1>
        <h5 class="subtitle">
            
                <time datetime="2018-06-17T09:27:27.000Z" itemprop="datePublished" class="page-time">
  2018-06-17
</time>


	<ul class="article-category-list"><li class="article-category-list-item"><a class="article-category-list-link" href="/categories/frontend/">大前端</a></li></ul>

            
        </h5>
        
    </div>
    

</header>

<!-- Search panel: the input and result container ids are the ones passed to searchFunc later in the page. -->
<div id="site_search">
    <div class="search-title clearfix">
        <span class="pull-left">
          <i class="icon icon-lg icon-search" aria-hidden="true"></i>
        </span>
        <!-- A placeholder is not a label; aria-label gives the field an accessible name. -->
        <input type="text" id="local-search-input" name="q" results="0" placeholder="search my blog..." class="form-control pull-left" aria-label="search my blog">
        <a href="javascript:;" class="close pull-right waves-effect waves-circle waves-light" aria-label="关闭搜索">
          <i class="icon icon-lg icon-close" aria-hidden="true"></i>
        </a>
    </div>
    <div id="local-search-result"></div>
</div>


<div class="container body-wrap">
    <article id="post-20180617-14"
  class="post-article article-type-post" itemprop="blogPost">
    <div class="post-card">
        <h1 class="post-card-title">js正则表达式之爬虫</h1>
        <div class="post-meta">
            <time class="post-time" title="2018-06-17 09:27:27" datetime="2018-06-17T09:27:27.000Z"  itemprop="datePublished">2018-06-17</time>

            
	<ul class="article-category-list"><li class="article-category-list-item"><a class="article-category-list-link" href="/categories/frontend/">大前端</a></li></ul>



            

            
    <span class="leancloud-comment">
        <i class="icon icon-comment-o"></i>
        <a href="/frontend/20180617-14.html#comment">
            <span class="valine-comment-count" data-xid="/frontend/20180617-14.html"></span>
        </a>
    </span>



            
        </div>
        <div class="post-content" id="post-content" itemprop="postContent">
            
            <h3 id="前言"><a href="#前言" class="headerlink" title="前言"></a>前言</h3><p>通过爬虫爬取网页的dom之后就需要用正则来获取相关的信息，下面来说下正则表达式在爬虫中的应用</p>
<h3 id="匹配"><a href="#匹配" class="headerlink" title="匹配"></a>匹配</h3><blockquote>
<p>对于Html来说，一般需要匹配的就是相应DOM结构，比如标签属性，内容和数目之类的。</p>
</blockquote>
<p>拿一个简单的标签来说比如<code>&lt;span&gt;username&lt;/span&gt;</code>，我们想要拿到<code>username</code>这个内容来用正则实现。<br>这里要先补充一些需要使用的或者比较关键的正则关键词：</p>
<ul>
<li>自定义匹配多种字符 - <code>[ ]</code><ul>
<li>使用方括号 [ ] 包含一系列字符，能够匹配其中任意一个字符。</li>
<li>用 [^ ] 包含一系列字符，则能够匹配其中字符之外的任意一个字符。</li>
<li>虽然可以匹配其中任意一个，但是只能是一个，不是多个。</li>
</ul>
</li>
<li>修饰匹配次数 - <code>{}</code>，<code>?</code>，<code>+</code>，<code>*</code><ul>
<li><strong>{n}</strong> - 表达式重复n次，比如：”\w{2}” 相当于 “\w\w”；”a{5}” 相当于 “aaaaa”</li>
<li><strong>{m,n}</strong> - 表达式至少重复m次，最多重复n次，比如：”ba{1,3}”可以匹配 “ba”或”baa”或”baaa”</li>
<li><strong>{m,}</strong> - 表达式至少重复m次，比如：”\w\d{2,}”可以匹配 “a12”,”_456”,”M12344”…</li>
<li><strong>?</strong> - 匹配表达式0次或者1次，相当于 {0,1}，比如：”a[cd]?”可以匹配 “a”,”ac”,”ad”</li>
<li><strong>+</strong> - 表达式至少出现1次，相当于 {1,}，比如：”a+b”可以匹配 “ab”,”aab”,”aaab”…</li>
<li><strong>*</strong> - 表达式不出现或出现任意次，相当于 {0,}，比如：”\^*b”可以匹配 “b”,”^^^b”…</li>
</ul>
</li>
<li>用于group的字符 - <code>()</code><ul>
<li>在被修饰匹配次数的时候，括号中的表达式可以作为整体被修饰</li>
<li>取匹配结果的时候，括号中的表达式匹配到的内容可以被单独得到</li>
</ul>
</li>
<li>用于扩展表达式含义<ul>
<li><strong>g</strong>：代表可以进行全局匹配</li>
<li><strong>i</strong>：代表不区分大小写匹配</li>
<li><strong>m</strong>：代表可以进行多行匹配</li>
</ul>
</li>
</ul>
<p>上面几个概念就是我们所需要的所有东西了。</p>
<p>接下来开始匹配span标签并且取出里面的内容<br><figure class="highlight plain"><table><tr><td class="code"><pre><span class="line">&lt;span&gt;username&lt;/span&gt;</span><br></pre></td></tr></table></figure></p>
<p>使用下面的正则即可，其中非贪婪模式很简单的避免了多个相同标签时匹配不正确的问题<br><figure class="highlight plain"><table><tr><td class="code"><pre><span class="line">/&lt;span&gt;(.*?)&lt;\/span&gt;/</span><br></pre></td></tr></table></figure></p>
<p>如果要取出某些属性的标签比如<br><figure class="highlight plain"><table><tr><td class="code"><pre><span class="line">&lt;span id=&quot;user&quot;&gt;username&lt;/span&gt;</span><br></pre></td></tr></table></figure></p>
<p>则可以使用如下正则<br><figure class="highlight plain"><table><tr><td class="code"><pre><span class="line">/&lt;span id=&quot;(.*?)&quot;&gt;(.*?)&lt;\/span&gt;/</span><br></pre></td></tr></table></figure></p>
<p>这样就能够简单快速解决html里面各种标签的数据获取问题</p>
<h3 id="筛选"><a href="#筛选" class="headerlink" title="筛选"></a>筛选</h3><p>对于匹配出来的数据，如果我们需要全局匹配，可以使用<code>String.match(RegExp)</code>方法。不过该方法只能获取一次的匹配结果，如果需要匹配大段文本中多次的结果就需要使用<code>RegExp.exec(string)</code>方法进行匹配。<br><figure class="highlight plain"><table><tr><td class="code"><pre><span class="line">var html = &apos;&lt;span id=&quot;sp_1&quot;&gt;a&lt;/span&gt;&lt;span id=&quot;sp_2&quot;&gt;b&lt;/span&gt;&lt;span id=&quot;sp_3&quot;&gt;c&lt;/span&gt;&apos;;</span><br><span class="line">var reg = /&lt;span id=&quot;(.*?)&quot;&gt;(.*?)&lt;\/span&gt;/g;</span><br><span class="line">var result;</span><br><span class="line">while ((result = reg.exec(html)) != null)  &#123;</span><br><span class="line">      console.log(result[1],result[2])</span><br><span class="line">&#125;</span><br><span class="line">//sp_1 a</span><br><span class="line">//sp_2 b</span><br><span class="line">//sp_3 c</span><br></pre></td></tr></table></figure></p>
<p>这样，对于绝大部分网页内容分析筛选的工作都可以使用简单的正则快速的完成了。</p>

        </div>
        
<blockquote class="post-copyright">
    <div class="content">
        
<span class="post-time">
    Last updated: <time datetime="2018-06-17T01:31:44.909Z" itemprop="dateUpdated">2018-06-17 01:31:44</time>
</span><br>


        
        转载注明出处，原文地址：<a href="/frontend/20180617-14.html" target="_blank" rel="external">http://blog.inbelieve.top/frontend/20180617-14.html</a>
        
    </div>
    <footer>
        <a href="http://blog.inbelieve.top">
            <img src="/img/avatar.jpg" alt="Martin Long">
            Martin Long
        </a>
    </footer>
</blockquote>

        
            <div class="page-reward">
    <a id="rewardBtn" href="javascript:;" class="page-reward-btn waves-effect waves-circle waves-light">赏</a>
</div>

            
        
        <div class="post-footer">
            
	<ul class="article-tag-list"><li class="article-tag-list-item"><a class="article-tag-list-link" href="/tags/javascript/">JavaScript</a></li></ul>

            <div class="page-share-wrap">
    

<div class="page-share" id="pageShare">
    <ul class="reset share-icons">
      
      <li>
        <a class="weixin share-sns wxFab" href="javascript:;" data-title="微信" aria-label="分享到微信">
          <i class="icon icon-weixin" aria-hidden="true"></i>
        </a>
      </li>
      
      <li>
        <!-- Ampersands in the query string are written as &amp; so the attribute value is valid HTML;
             the resulting URL is unchanged. rel keeps the new tab from getting a window.opener handle. -->
        <a class="qq share-sns" target="_blank" rel="noopener noreferrer" href="http://connect.qq.com/widget/shareqq/index.html?url=http://blog.inbelieve.top/frontend/20180617-14.html&amp;title=《js正则表达式之爬虫》 — 追光者的部落&amp;source=有些鸟儿天生是没办法被关在笼子里的,因为他的每一片羽毛都闪耀着自由的光辉" data-title=" QQ" aria-label="分享到 QQ">
          <i class="icon icon-qq" aria-hidden="true"></i>
        </a>
      </li>
      
    </ul>
 </div>



    <a href="javascript:;" id="shareFab" class="page-share-fab waves-effect waves-circle" aria-label="分享">
        <i class="icon icon-share-alt icon-lg" aria-hidden="true"></i>
    </a>
</div>

        </div>
        
            
    <div id="comment"></div>



        
    </div>
    
<nav class="post-nav flex-row flex-justify-between">
  
    <div class="prev">
      <a href="/frontend/20180617-15.html" id="post-prev" class="post-nav-link">
        <div class="tips"><i class="icon icon-angle-left icon-lg icon-pr"></i> Prev</div>
        <h4 class="title">js数组之forEach,every,some,filter,map的区别</h4>
      </a>
    </div>
  

  
    <div class="next">
      <a href="/frontend/20180617-13.html" id="post-next" class="post-nav-link">
        <div class="tips">Next <i class="icon icon-angle-right icon-lg icon-pl"></i></div>
        <h4 class="title">nodejs之pm2</h4>
      </a>
    </div>
  
</nav>


    
    
        <!-- Table-of-contents sidebar; each link targets a heading id inside the post content. -->
        <aside class="post-widget">
            <nav class="post-toc-wrap" id="post-toc">
                <strong>目录</strong>
                <ol class="post-toc"><li class="post-toc-item post-toc-level-3"><a class="post-toc-link" href="#前言"><span class="post-toc-number">1.</span> <span class="post-toc-text">前言</span></a></li><li class="post-toc-item post-toc-level-3"><a class="post-toc-link" href="#匹配"><span class="post-toc-number">2.</span> <span class="post-toc-text">匹配</span></a></li><li class="post-toc-item post-toc-level-3"><a class="post-toc-link" href="#筛选"><span class="post-toc-number">3.</span> <span class="post-toc-text">筛选</span></a></li></ol>
            </nav>
            <!-- Previously written as <div class="toc-bar"><div>, which opened a second div instead of closing this one. -->
            <div class="toc-bar"></div>
        </aside>
    
</article>

    <!-- Reward modal: shows a payment QR code; the checkbox carries both image paths in data-* attributes. -->
    <div id="reward" class="page-modal reward-lay">
    <!-- Icon-only close control: aria-label supplies the accessible name. -->
    <a class="close" href="javascript:;" aria-label="关闭"><i class="icon icon-close" aria-hidden="true"></i></a>
    <h3 class="reward-title">
            请作者喝杯咖啡
    </h3>
    <div class="reward-content">
        
        <div class="reward-code">
            <img id="rewardCode" src="/img/wechat.jpg" alt="打赏二维码">
        </div>
        
        <label class="reward-toggle">
            <input id="rewardToggle" type="checkbox" class="reward-toggle-check"
                data-wechat="/img/wechat.jpg" data-alipay="/img/alipay.jpg">
            <div class="reward-toggle-ctrol">
                <span class="reward-toggle-item wechat">微信</span>
                <span class="reward-toggle-label"></span>
                <span class="reward-toggle-item alipay">支付宝</span>
            </div>
        </label>
        
    </div>
</div>

    
</div>

        <footer class="footer">
    
    <div class="bottom">
        <p>
            <span>
                Martin Long &copy; 2017 - 2018
            </span>
        		
           	
            
            
            <span>
	            Powered by <a href="http://hexo.io/" target="_blank">Hexo</a> Theme <a href="https://github.com/codefine/hexo-theme-mellow" target="_blank">book</a>
            </span>
            
            
            
        </p>
    </div>
</footer>

    </main>
    <div class="mask" id="mask"></div>
<a href="javascript:;" id="gotop" class="waves-effect waves-circle waves-light" aria-label="返回顶部"><span class="icon icon-lg icon-chevron-up" aria-hidden="true"></span></a>



<div class="global-share" id="globalShare">
    <ul class="reset share-icons">
      
      <li>
        <a class="weixin share-sns wxFab" href="javascript:;" data-title="微信" aria-label="分享到微信">
          <i class="icon icon-weixin" aria-hidden="true"></i>
        </a>
      </li>
      
      <li>
        <!-- Ampersands in the query string are written as &amp; so the attribute value is valid HTML;
             the resulting URL is unchanged. rel keeps the new tab from getting a window.opener handle. -->
        <a class="qq share-sns" target="_blank" rel="noopener noreferrer" href="http://connect.qq.com/widget/shareqq/index.html?url=http://blog.inbelieve.top/frontend/20180617-14.html&amp;title=《js正则表达式之爬虫》 — 追光者的部落&amp;source=有些鸟儿天生是没办法被关在笼子里的,因为他的每一片羽毛都闪耀着自由的光辉" data-title=" QQ" aria-label="分享到 QQ">
          <i class="icon icon-qq" aria-hidden="true"></i>
        </a>
      </li>
      
    </ul>
 </div>


<div class="page-modal wx-share" id="wxShare">
    <!-- Icon-only close control: aria-label supplies the accessible name. -->
    <a class="close" href="javascript:;" aria-label="关闭"><i class="icon icon-close" aria-hidden="true"></i></a>
    <p>扫一扫，分享到微信</p>
    <!-- NOTE(review): src is empty in the markup; presumably a script fills in the QR code URL — confirm. -->
    <img src="" alt="微信分享二维码">
</div>


    
    <!-- main-js -->
<!-- jQuery is requested over explicit https, matching the waves.min.js URL below; a
     protocol-relative URL inherits the page scheme and would load over plain http on an http page. -->
<!-- NOTE(review): third-party CDN scripts have no integrity attribute; consider adding SRI hashes or self-hosting. -->
<script type="text/javascript" src="https://cdn.bootcss.com/jquery/2.0.0/jquery.min.js"></script>
<script type="text/javascript" src="/js/plugins/fastclick.js?v=1.0.0"></script>

<script type="text/javascript" src="https://cdn.bootcss.com/node-waves/0.7.4/waves.min.js"></script>

<script type="text/javascript" src="/js/method.js?v=1.0.0"></script>
<script type="text/javascript" src="/js/blog.js?v=1.0.0"></script>

<!-- third-party -->





<script type="text/javascript" src="/js/plugins/local_search.js?v=1.0.0"></script>
<script type="text/javascript">
	// Build the root-relative path of the search index, falling back to
	// "search.xml" when the configured value is empty, then call searchFunc
	// with that path and the ids of the search input and the result container.
	// NOTE(review): searchFunc is presumably defined by /js/plugins/local_search.js — confirm.
	var search_path = "search.xml";
	search_path = search_path.length === 0 ? "search.xml" : search_path;
	var path = "/" + search_path;
	searchFunc(path, "local-search-input", "local-search-result");
</script>



    
        <script type="text/javascript" src="https://cdn1.lncld.net/static/js/3.0.4/av-min.js"></script>
<!-- Valine is requested over explicit https like the script above, instead of inheriting the page scheme. -->
<!-- NOTE(review): this URL carries no version, so it follows whatever release the CDN serves; consider pinning one. -->
<script type="text/javascript" src="https://unpkg.com/valine/dist/Valine.min.js"></script>
<script type="text/javascript" src="/js/plugins/valine.js?v=1.0.0"></script>
    
    







    <script>
    // Swaps the tab title while the page is hidden, shows a greeting when it
    // becomes visible again, and restores the original title two seconds later.
    (function () {
        var savedTitle = document.title;
        var restoreTimer;

        // Puts back the title captured when this script first ran.
        function restoreTitle() {
            document.title = savedTitle;
        }

        function onVisibilityChange() {
            if (!document.hidden) {
                document.title = 'welcome!';
                restoreTimer = setTimeout(restoreTitle, 2000);
                return;
            }
            document.title = 'leaving！';
            // Cancel a pending restore so it cannot overwrite the "leaving" title.
            clearTimeout(restoreTimer);
        }

        document.addEventListener('visibilitychange', onVisibilityChange);
    })();
</script>




    
</body>
</html>
